Package org.terrier.structures.indexing

Source Code of org.terrier.structures.indexing.DocumentPostingList$postingIterator

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.uk
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is DocumentPostingList.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*  
*/

package org.terrier.structures.indexing;

import gnu.trove.TObjectIntHashMap;
import gnu.trove.TObjectIntProcedure;

import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;

import org.terrier.sorting.HeapSortInt;
import org.terrier.structures.BasicDocumentIndexEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.postings.BasicPostingImpl;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.structures.postings.IterablePostingImpl;
import org.terrier.structures.postings.WritablePosting;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.TermCodes;
/** Represents the postings of one document. Uses HashMaps internally.
  * <p>
  * <b>Properties:</b><br>
  * <ul><li><tt>indexing.avg.unique.terms.per.doc</tt> - number of unique terms per doc on average, used to tune the initial
  * size of the hashmaps used in this class.</li></ul>
  */
public class DocumentPostingList {
  /** number of unique terms per doc on average, used to tune the initial size of the hashmaps used in this class. */
  protected static final int AVG_DOCUMENT_UNIQUE_TERMS =
    Integer.parseInt(ApplicationSetup.getProperty("indexing.avg.unique.terms.per.doc", "120"));

  /** length of the document so far. Sum of the term frequencies inserted so far. */
  protected int documentLength = 0;

  /** mapping term to tf mapping */ 
  protected final TObjectIntHashMap<String> occurrences = new TObjectIntHashMap<String>(AVG_DOCUMENT_UNIQUE_TERMS);
 
  /** Create a new DocumentPostingList object */
  public DocumentPostingList()
  {}
 
  /** Returns all terms in this posting list */
  public String[] termSet()
  {
    return occurrences.keys(new String[0]);
  }
 
  /** Return the frequency of the specified term in this document */
  public int getFrequency(String term)
  {
    return occurrences.get(term);
 

  /** Removes all postings from this document */
  public void clear()
  {
    occurrences.clear();
    documentLength = 0;
  }

  /** Returns the total number of tokens in this document */ 
  public int getDocumentLength()
  {
    return documentLength;
  }

  /** Returns the number of unique terms in this document. */
  public int getNumberOfPointers()
  {
    return occurrences.size();
  }
  /** Insert a term into the posting list of this document
    * @param term the Term being inserted */
  public void insert(final String term)
  {
    occurrences.adjustOrPutValue(term,1,1);
    documentLength++;
  }
 
  /** Insert a term into the posting list of this document
    * @param tf frequency
      * @param term the Term being inserted */
    public void insert(final int tf, final String term)
    {
        occurrences.adjustOrPutValue(term,tf,tf);
        documentLength++;
    }

    /** Return a DocumentIndexEntry for this document */
    public DocumentIndexEntry getDocumentStatistics()
  {
    DocumentIndexEntry die = new BasicDocumentIndexEntry();
    die.setDocumentLength(this.getDocumentLength());
    die.setNumberOfEntries(this.getNumberOfPointers());
    return die;
  }
   
    /** Execute the specifed method for each term. */
    public void forEachTerm(TObjectIntProcedure<String> proc)
    {
      this.occurrences.forEachEntry(proc);
    }
   
    /** Used by getPostings() and getPostings2() to obtain the term id of the term.
     * This implementation uses the TermCodes class. */
    protected int getTermId(String term)
    {
      return TermCodes.getCode(term);
    }

  /** Returns the postings suitable to be written into the direct index.
   * During this, TermIds are assigned. */
  public int[][] getPostings()
  {
    final int termCount = occurrences.size();
    final int[] termids = new int[termCount];
    final int[] tfs = new int[termCount];
    occurrences.forEachEntry( new TObjectIntProcedure<String>() {
      int i=0;
      public boolean execute(final String a, final int b)
      {
        termids[i] = getTermId(a);
        tfs[i++] = b;
        return true;
      }
    });
    HeapSortInt.ascendingHeapSort(termids, tfs);
    return new int[][]{termids, tfs};
  }
 
  /** Returns a posting iterator suitable to be written into the direct index.
   * During this, TermIds are assigned, using getTermId() method. */
  public IterablePosting getPostings2()
  {
    //obtain and sort termids by id
   
    final int termCount = occurrences.size();
    final TObjectIntHashMap<String> cache_termids = new TObjectIntHashMap<String>(termCount)
   
    occurrences.forEachEntry( new TObjectIntProcedure<String>() {
      public boolean execute(final String a, final int b)
      {
        cache_termids.put(a, getTermId(a));
        return true;
      }
    });
   
    final String[] terms = cache_termids.keys(new String[termCount]);
    Arrays.sort(terms, new Comparator<String>(){
      public int compare(String o1, String o2) {
        return cache_termids.get(o1) - cache_termids.get(o2);
      }     
    });
    final int[] termIds = new int[termCount];
    int i=0;
    for(String t : terms)
    {
      termIds[i++] = cache_termids.get(t);
    }
    return makePostingIterator(terms, termIds);
  }
 
  protected IterablePosting makePostingIterator(String[] _terms, int[] termIds)
  {
    return new postingIterator(_terms, termIds);
  }
 
  protected class postingIterator extends IterablePostingImpl
  {
    String[] terms;
    int[] termIds;
    int i = -1;
   
    public postingIterator(String[] _terms, int[] _termIds)
    {
      terms = _terms;
      termIds = _termIds;
    }
   
    public WritablePosting asWritablePosting() {
      return new BasicPostingImpl(termIds[i], getFrequency());
    }

    public int getDocumentLength() {
      return documentLength;
    }

    public int getFrequency() {
      return occurrences.get(terms[i]);
    }

    public int getId() {
      return termIds[i];
    }

    public void setId(int id) {
      termIds[i] = id;
    }

    public int next() throws IOException {
      if (i >= termIds.length -1)
        return EOL;
      i++;
      return termIds[i];
    }
   
    /** {@inheritDoc} */
    public boolean endOfPostings() {
      return (i >= termIds.length -1);
    }

    public void close() throws IOException {
      terms = null;
      termIds = null;
    }
  }
 
}
TOP

Related Classes of org.terrier.structures.indexing.DocumentPostingList$postingIterator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.